From 15f43332e77eb5a292550debecd77ef6e6203b3b Mon Sep 17 00:00:00 2001 From: "kaf24@scramble.cl.cam.ac.uk" Date: Thu, 10 Feb 2005 11:26:37 +0000 Subject: [PATCH] bitkeeper revision 1.1159.1.564 (420b44edsb8XzPev-TiGW16GSsCW6g) More x86_64 stuff. Added hypercalls to register a user-space pagetable, modify FS/GS base addresses, and switch to user mode. User mode switches back to kernel mode automatically on executing SYSCALL instruction. Still todo: 1. getdomaininfo needs to include pagetable_user 2. get writable and shadow pagetables working 3. testing Signed-off-by: keir.fraser@cl.cam.ac.uk --- xen/arch/x86/domain.c | 320 +++++++++++++++------------ xen/arch/x86/mm.c | 22 ++ xen/arch/x86/setup.c | 4 +- xen/arch/x86/vmx_io.c | 2 +- xen/arch/x86/vmx_vmcs.c | 2 +- xen/arch/x86/x86_32/entry.S | 18 +- xen/arch/x86/x86_64/entry.S | 45 ++-- xen/arch/x86/x86_64/mm.c | 30 ++- xen/arch/x86/x86_64/traps.c | 26 ++- xen/include/asm-x86/domain.h | 1 + xen/include/asm-x86/msr.h | 2 +- xen/include/asm-x86/processor.h | 6 +- xen/include/asm-x86/x86_32/current.h | 6 +- xen/include/asm-x86/x86_64/current.h | 6 +- xen/include/public/arch-x86_64.h | 32 ++- xen/include/public/xen.h | 11 +- 16 files changed, 335 insertions(+), 198 deletions(-) diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c index 59c9dc1e92..294ed178c1 100644 --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -256,6 +256,8 @@ void arch_do_createdomain(struct exec_domain *ed) SET_DEFAULT_FAST_TRAP(&ed->arch); + ed->arch.flags = TF_kernel_mode; + if ( d->id == IDLE_DOMAIN_ID ) { ed->arch.schedule_tail = continue_idle_task; @@ -287,8 +289,6 @@ void arch_do_createdomain(struct exec_domain *ed) d->arch.mm_perdomain_l3[l3_table_offset(PERDOMAIN_VIRT_START)] = mk_l3_pgentry(__pa(d->arch.mm_perdomain_l2) | __PAGE_HYPERVISOR); #endif - - ed->arch.flags = TF_kernel_mode; } } @@ -550,6 +550,172 @@ void new_thread(struct exec_domain *d, } +#ifdef __x86_64__ + +#define loadsegment(seg,value) ({ \ + int __r = 1; \ + __asm__ __volatile__ ( \ + "1: movl %k1,%%" #seg "\n2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3: xorl %k0,%k0\n" \ + " movl %k0,%%" #seg "\n" \ + " jmp 2b\n" \ + ".previous\n" \ + ".section __ex_table,\"a\"\n" \ + " .align 8\n" \ + " .quad 1b,3b\n" \ + ".previous" \ + : "=r" (__r) : "r" (value), "0" (__r) );\ + __r; }) + +static void switch_segments( + struct xen_regs *regs, struct exec_domain *p, struct exec_domain *n) +{ + int all_segs_okay = 1; + + if ( !is_idle_task(p->domain) ) + { + __asm__ __volatile__ ( "movl %%ds,%0" : "=m" (p->arch.user_ctxt.ds) ); + __asm__ __volatile__ ( "movl %%es,%0" : "=m" (p->arch.user_ctxt.es) ); + __asm__ __volatile__ ( "movl %%fs,%0" : "=m" (p->arch.user_ctxt.fs) ); + __asm__ __volatile__ ( "movl %%gs,%0" : "=m" (p->arch.user_ctxt.gs) ); + } + + /* Either selector != 0 ==> reload. */ + if ( unlikely(p->arch.user_ctxt.ds | + n->arch.user_ctxt.ds) ) + all_segs_okay &= loadsegment(ds, n->arch.user_ctxt.ds); + + /* Either selector != 0 ==> reload. */ + if ( unlikely(p->arch.user_ctxt.es | + n->arch.user_ctxt.es) ) + all_segs_okay &= loadsegment(es, n->arch.user_ctxt.es); + + /* + * Either selector != 0 ==> reload. + * Also reload to reset FS_BASE if it was non-zero. + */ + if ( unlikely(p->arch.user_ctxt.fs | + p->arch.user_ctxt.fs_base | + n->arch.user_ctxt.fs) ) + { + all_segs_okay &= loadsegment(fs, n->arch.user_ctxt.fs); + if ( p->arch.user_ctxt.fs ) /* != 0 selector kills fs_base */ + p->arch.user_ctxt.fs_base = 0; + } + + /* + * Either selector != 0 ==> reload. + * Also reload to reset GS_BASE if it was non-zero. + */ + if ( unlikely(p->arch.user_ctxt.gs | + p->arch.user_ctxt.gs_base_user | + n->arch.user_ctxt.gs) ) + { + /* Reset GS_BASE with user %gs? */ + if ( p->arch.user_ctxt.gs || !n->arch.user_ctxt.gs_base_user ) + all_segs_okay &= loadsegment(gs, n->arch.user_ctxt.gs); + if ( p->arch.user_ctxt.gs ) /* != 0 selector kills gs_base_user */ + p->arch.user_ctxt.gs_base_user = 0; + } + + /* This can only be non-zero if selector is NULL. */ + if ( n->arch.user_ctxt.fs_base ) + wrmsr(MSR_FS_BASE, + n->arch.user_ctxt.fs_base, + n->arch.user_ctxt.fs_base>>32); + + /* This can only be non-zero if selector is NULL. */ + if ( n->arch.user_ctxt.gs_base_user ) + wrmsr(MSR_GS_BASE, + n->arch.user_ctxt.gs_base_user, + n->arch.user_ctxt.gs_base_user>>32); + + /* This can only be non-zero if selector is NULL. */ + if ( p->arch.user_ctxt.gs_base_kernel | + n->arch.user_ctxt.gs_base_kernel ) + wrmsr(MSR_SHADOW_GS_BASE, + n->arch.user_ctxt.gs_base_kernel, + n->arch.user_ctxt.gs_base_kernel>>32); + + /* If in kernel mode then switch the GS bases around. */ + if ( n->arch.flags & TF_kernel_mode ) + __asm__ __volatile__ ( "swapgs" ); + + if ( unlikely(!all_segs_okay) ) + { + unsigned long *rsp = + (n->arch.flags & TF_kernel_mode) ? + (unsigned long *)regs->rsp : + (unsigned long *)n->arch.kernel_sp; + + if ( put_user(regs->ss, rsp- 1) | + put_user(regs->rsp, rsp- 2) | + put_user(regs->rflags, rsp- 3) | + put_user(regs->cs, rsp- 4) | + put_user(regs->rip, rsp- 5) | + put_user(regs->gs, rsp- 6) | + put_user(regs->fs, rsp- 7) | + put_user(regs->es, rsp- 8) | + put_user(regs->ds, rsp- 9) | + put_user(regs->r11, rsp-10) | + put_user(regs->rcx, rsp-11) ) + { + DPRINTK("Error while creating failsafe callback frame.\n"); + domain_crash(); + } + + if ( !(n->arch.flags & TF_kernel_mode) ) + { + n->arch.flags |= TF_kernel_mode; + __asm__ __volatile__ ( "swapgs" ); + write_ptbase(n); + } + + regs->entry_vector = TRAP_syscall; + regs->rflags &= 0xFFFCBEFFUL; + regs->ss = __GUEST_SS; + regs->rsp = (unsigned long)(rsp-11); + regs->cs = __GUEST_CS; + regs->rip = n->arch.failsafe_address; + } +} + +long do_switch_to_user(void) +{ + struct xen_regs *regs = get_execution_context(); + struct switch_to_user stu; + struct exec_domain *ed = current; + + if ( unlikely(copy_from_user(&stu, (void *)regs->rsp, sizeof(stu))) ) + return -EFAULT; + + ed->arch.flags &= ~TF_kernel_mode; + __asm__ __volatile__ ( "swapgs" ); + write_ptbase(ed); + + regs->rip = stu.rip; + regs->cs = stu.cs; + regs->rflags = stu.rflags; + regs->rsp = stu.rsp; + regs->ss = stu.ss; + + if ( !(stu.flags & ECF_IN_SYSCALL) ) + { + regs->entry_vector = 0; + regs->r11 = stu.r11; + regs->rcx = stu.rcx; + } + + return regs->rax; +} + +#elif defined(__i386__) + +#define switch_segments(_r, _p, _n) ((void)0) + +#endif + /* * This special macro can be used to load a debugging register */ @@ -566,21 +732,12 @@ void switch_to(struct exec_domain *prev_p, struct exec_domain *next_p) #ifdef CONFIG_VMX unsigned long vmx_domain = next_p->arch.arch_vmx.flags; #endif -#ifdef __x86_64__ - int all_segs_okay = 1; -#endif __cli(); /* Switch guest general-register state. */ if ( !is_idle_task(prev_p->domain) ) { -#ifdef __x86_64__ - __asm__ __volatile__ ( "movl %%ds,%0" : "=m" (stack_ec->ds) ); - __asm__ __volatile__ ( "movl %%es,%0" : "=m" (stack_ec->es) ); - __asm__ __volatile__ ( "movl %%fs,%0" : "=m" (stack_ec->fs) ); - __asm__ __volatile__ ( "movl %%gs,%0" : "=m" (stack_ec->gs) ); -#endif memcpy(&prev_p->arch.user_ctxt, stack_ec, sizeof(*stack_ec)); @@ -624,7 +781,7 @@ void switch_to(struct exec_domain *prev_p, struct exec_domain *next_p) SET_FAST_TRAP(&next_p->arch); #ifdef __i386__ - /* Switch the guest OS ring-1 stack. */ + /* Switch the kernel ring-1 stack. */ tss->esp1 = next_p->arch.kernel_sp; tss->ss1 = next_p->arch.kernel_ss; #endif @@ -660,126 +817,7 @@ void switch_to(struct exec_domain *prev_p, struct exec_domain *next_p) __sti(); -#ifdef __x86_64__ - -#define loadsegment(seg,value) ({ \ - int __r = 1; \ - __asm__ __volatile__ ( \ - "1: movl %k1,%%" #seg "\n2:\n" \ - ".section .fixup,\"ax\"\n" \ - "3: xorl %k0,%k0\n" \ - " movl %k0,%%" #seg "\n" \ - " jmp 2b\n" \ - ".previous\n" \ - ".section __ex_table,\"a\"\n" \ - " .align 8\n" \ - " .quad 1b,3b\n" \ - ".previous" \ - : "=r" (__r) : "r" (value), "0" (__r) );\ - __r; }) - - /* Either selector != 0 ==> reload. */ - if ( unlikely(prev_p->arch.user_ctxt.ds) || - unlikely(next_p->arch.user_ctxt.ds) ) - all_segs_okay &= loadsegment(ds, next_p->arch.user_ctxt.ds); - - /* Either selector != 0 ==> reload. */ - if ( unlikely(prev_p->arch.user_ctxt.es) || - unlikely(next_p->arch.user_ctxt.es) ) - all_segs_okay &= loadsegment(es, next_p->arch.user_ctxt.es); - - /* - * Either selector != 0 ==> reload. - * Also reload to reset FS_BASE if it was non-zero. - */ - if ( unlikely(prev_p->arch.user_ctxt.fs) || - unlikely(prev_p->arch.user_ctxt.fs_base) || - unlikely(next_p->arch.user_ctxt.fs) ) - { - all_segs_okay &= loadsegment(fs, next_p->arch.user_ctxt.fs); - if ( prev_p->arch.user_ctxt.fs ) /* != 0 selector kills fs_base */ - prev_p->arch.user_ctxt.fs_base = 0; - } - - /* - * Either selector != 0 ==> reload. - * Also reload to reset GS_BASE if it was non-zero. - */ - if ( unlikely(prev_p->arch.user_ctxt.gs) || - unlikely(prev_p->arch.user_ctxt.gs_base_os) || - unlikely(prev_p->arch.user_ctxt.gs_base_app) || - unlikely(next_p->arch.user_ctxt.gs) ) - { - /* Reset GS_BASE with user %gs. */ - all_segs_okay &= loadsegment(gs, next_p->arch.user_ctxt.gs); - /* Reset KERNEL_GS_BASE if we won't be doing it later. */ - if ( !next_p->arch.user_ctxt.gs_base_os ) - wrmsr(MSR_KERNEL_GS_BASE, 0, 0); - if ( prev_p->arch.user_ctxt.gs ) /* != 0 selector kills app gs_base */ - prev_p->arch.user_ctxt.gs_base_app = 0; - } - - /* This can only be non-zero if selector is NULL. */ - if ( next_p->arch.user_ctxt.fs_base ) - wrmsr(MSR_FS_BASE, - next_p->arch.user_ctxt.fs_base, - next_p->arch.user_ctxt.fs_base>>32); - - /* This can only be non-zero if selector is NULL. */ - if ( next_p->arch.user_ctxt.gs_base_os ) - wrmsr(MSR_KERNEL_GS_BASE, - next_p->arch.user_ctxt.gs_base_os, - next_p->arch.user_ctxt.gs_base_os>>32); - - /* This can only be non-zero if selector is NULL. */ - if ( next_p->arch.user_ctxt.gs_base_app ) - wrmsr(MSR_GS_BASE, - next_p->arch.user_ctxt.gs_base_app, - next_p->arch.user_ctxt.gs_base_app>>32); - - /* If in guest-OS mode, switch the GS bases around. */ - if ( next_p->arch.flags & TF_kernel_mode ) - __asm__ __volatile__ ( "swapgs" ); - - if ( unlikely(!all_segs_okay) ) - { - unsigned long *rsp = - (next_p->arch.flags & TF_kernel_mode) ? - (unsigned long *)stack_ec->rsp : - (unsigned long *)next_p->arch.kernel_sp; - - if ( put_user(stack_ec->ss, rsp- 1) | - put_user(stack_ec->rsp, rsp- 2) | - put_user(stack_ec->rflags, rsp- 3) | - put_user(stack_ec->cs, rsp- 4) | - put_user(stack_ec->rip, rsp- 5) | - put_user(stack_ec->gs, rsp- 6) | - put_user(stack_ec->fs, rsp- 7) | - put_user(stack_ec->es, rsp- 8) | - put_user(stack_ec->ds, rsp- 9) | - put_user(stack_ec->r11, rsp-10) | - put_user(stack_ec->rcx, rsp-11) ) - { - DPRINTK("Error while creating failsafe callback frame.\n"); - domain_crash(); - } - - if ( !(next_p->arch.flags & TF_kernel_mode) ) - { - next_p->arch.flags |= TF_kernel_mode; - __asm__ __volatile__ ( "swapgs" ); - /* XXX switch page tables XXX */ - } - - stack_ec->entry_vector = TRAP_syscall; - stack_ec->rflags &= 0xFFFCBEFFUL; - stack_ec->ss = __GUEST_SS; - stack_ec->rsp = (unsigned long)(rsp-11); - stack_ec->cs = __GUEST_CS; - stack_ec->rip = next_p->arch.failsafe_address; - } - -#endif /* __x86_64__ */ + switch_segments(stack_ec, prev_p, next_p); } @@ -935,13 +973,23 @@ void domain_relinquish_memory(struct domain *d) /* Exit shadow mode before deconstructing final guest page table. */ shadow_mode_disable(d); - /* Drop the in-use reference to the page-table base. */ + /* Drop the in-use references to page-table bases. */ for_each_exec_domain ( d, ed ) { if ( pagetable_val(ed->arch.pagetable) != 0 ) - put_page_and_type(&frame_table[pagetable_val(ed->arch.pagetable) >> - PAGE_SHIFT]); - ed->arch.pagetable = mk_pagetable(0); + { + put_page_and_type( + &frame_table[pagetable_val(ed->arch.pagetable) >> PAGE_SHIFT]); + ed->arch.pagetable = mk_pagetable(0); + } + + if ( pagetable_val(ed->arch.pagetable_user) != 0 ) + { + put_page_and_type( + &frame_table[pagetable_val(ed->arch.pagetable_user) >> + PAGE_SHIFT]); + ed->arch.pagetable_user = mk_pagetable(0); + } } #ifdef CONFIG_VMX diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c index f4e11f5194..123ff0fbda 100644 --- a/xen/arch/x86/mm.c +++ b/xen/arch/x86/mm.c @@ -209,6 +209,10 @@ void write_ptbase(struct exec_domain *ed) #else if ( unlikely(shadow_mode(d)) ) pa = pagetable_val(ed->arch.shadow_table); +#ifdef __x86_64__ + else if ( !(ed->arch.flags & TF_kernel_mode) ) + pa = pagetable_val(ed->arch.pagetable_user); +#endif else pa = pagetable_val(ed->arch.pagetable); #endif @@ -1341,6 +1345,24 @@ static int do_extended_command(unsigned long ptr, unsigned long val) okay = new_guest_cr3(pfn); break; +#ifdef __x86_64__ + case MMUEXT_NEW_USER_BASEPTR: + okay = get_page_and_type_from_pagenr(pfn, PGT_root_page_table, d); + if ( unlikely(!okay) ) + { + MEM_LOG("Error while installing new baseptr %p", pfn); + } + else + { + unsigned long old_pfn = + pagetable_val(ed->arch.pagetable_user) >> PAGE_SHIFT; + ed->arch.pagetable_user = mk_pagetable(pfn << PAGE_SHIFT); + if ( old_pfn != 0 ) + put_page_and_type(&frame_table[old_pfn]); + } + break; +#endif + case MMUEXT_TLB_FLUSH: percpu_info[cpu].deferred_ops |= DOP_FLUSH_TLB; break; diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c index 8c699b4586..ba8e662ad4 100644 --- a/xen/arch/x86/setup.c +++ b/xen/arch/x86/setup.c @@ -325,9 +325,9 @@ void __init cpu_init(void) memset(t->io_bitmap, ~0, sizeof(t->io_bitmap)); #if defined(__i386__) t->ss0 = __HYPERVISOR_DS; - t->esp0 = get_stack_top(); + t->esp0 = get_stack_bottom(); #elif defined(__x86_64__) - t->rsp0 = get_stack_top(); + t->rsp0 = get_stack_bottom(); #endif set_tss_desc(nr,t); load_TR(nr); diff --git a/xen/arch/x86/vmx_io.c b/xen/arch/x86/vmx_io.c index 3241ada865..9fbaa05298 100644 --- a/xen/arch/x86/vmx_io.c +++ b/xen/arch/x86/vmx_io.c @@ -382,7 +382,7 @@ void vmx_do_resume(struct exec_domain *d) { __vmwrite(HOST_CR3, pagetable_val(d->arch.monitor_table)); __vmwrite(GUEST_CR3, pagetable_val(d->arch.shadow_table)); - __vmwrite(HOST_ESP, (unsigned long) get_stack_top()); + __vmwrite(HOST_ESP, (unsigned long)get_stack_bottom()); if (event_pending(d)) { if (test_bit(IOPACKET_PORT, &d->domain->shared_info->evtchn_pending[0])) diff --git a/xen/arch/x86/vmx_vmcs.c b/xen/arch/x86/vmx_vmcs.c index 07af40ca24..c9f1e9de1b 100644 --- a/xen/arch/x86/vmx_vmcs.c +++ b/xen/arch/x86/vmx_vmcs.c @@ -222,7 +222,7 @@ void vmx_do_launch(struct exec_domain *ed) ed->arch.shadow_table = ed->arch.pagetable; __vmwrite(GUEST_CR3, pagetable_val(ed->arch.pagetable)); __vmwrite(HOST_CR3, pagetable_val(ed->arch.monitor_table)); - __vmwrite(HOST_ESP, (unsigned long) get_stack_top()); + __vmwrite(HOST_ESP, (unsigned long)get_stack_bottom()); ed->arch.schedule_tail = arch_vmx_do_resume; } diff --git a/xen/arch/x86/x86_32/entry.S b/xen/arch/x86/x86_32/entry.S index 45fec67827..3e33befc79 100644 --- a/xen/arch/x86/x86_32/entry.S +++ b/xen/arch/x86/x86_32/entry.S @@ -81,7 +81,7 @@ * (9) u32 fs; * (8) u32 ds; * (7) u32 es; - * <- get_stack_top() (= HOST_ESP) + * <- get_stack_bottom() (= HOST_ESP) * (6) u32 ss; * (5) u32 esp; * (4) u32 eflags; @@ -89,8 +89,8 @@ * (2) u32 eip; * (2/1) u16 entry_vector; * (1/1) u16 error_code; - * However, get_stack_top() acturally returns 20 bytes below the real - * top of the stack to allow space for: + * However, get_stack_bottom() actually returns 20 bytes before the real + * bottom of the stack to allow space for: * domain pointer, DS, ES, FS, GS. Therefore, we effectively skip 6 registers. */ #define VMX_MONITOR_EFLAGS 0x202 /* IF on */ @@ -173,8 +173,8 @@ vmx_process_softirqs: ALIGN restore_all_guest: - testb $TF_failsafe_return,EDOMAIN_thread_flags(%ebx) - jnz failsafe_callback + btr $_TF_failsafe_return,EDOMAIN_thread_flags(%ebx) + jc failsafe_callback testl $X86_EFLAGS_VM,XREGS_eflags(%esp) jnz restore_all_vm86 FLT1: movl XREGS_ds(%esp),%ds @@ -216,9 +216,8 @@ FIX1: SET_XEN_SEGMENTS(a) DBLFLT1:GET_CURRENT(%ebx) jmp test_all_events DBLFIX1:GET_CURRENT(%ebx) - testb $TF_failsafe_return,EDOMAIN_thread_flags(%ebx) - jnz domain_crash # cannot reenter failsafe code - orb $TF_failsafe_return,EDOMAIN_thread_flags(%ebx) + bts $_TF_failsafe_return,EDOMAIN_thread_flags(%ebx) + jc domain_crash # cannot reenter failsafe code jmp test_all_events # will return via failsafe code .previous .section __pre_ex_table,"a" @@ -235,7 +234,6 @@ DBLFIX1:GET_CURRENT(%ebx) /* No special register assumptions */ failsafe_callback: GET_CURRENT(%ebx) - andb $~TF_failsafe_return,EDOMAIN_thread_flags(%ebx) leal EDOMAIN_trap_bounce(%ebx),%edx movl EDOMAIN_failsafe_addr(%ebx),%eax movl %eax,TRAPBOUNCE_eip(%edx) @@ -282,8 +280,6 @@ ENTRY(hypercall) GET_CURRENT(%ebx) andl $(NR_hypercalls-1),%eax call *SYMBOL_NAME(hypercall_table)(,%eax,4) - -ret_from_hypercall: movl %eax,XREGS_eax(%esp) # save the return value test_all_events: diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S index ccda5d7008..8c00a685ad 100644 --- a/xen/arch/x86/x86_64/entry.S +++ b/xen/arch/x86/x86_64/entry.S @@ -20,8 +20,8 @@ ALIGN restore_all_guest: - testb $TF_failsafe_return,EDOMAIN_thread_flags(%rbx) - jnz failsafe_callback + btr $_TF_failsafe_return,EDOMAIN_thread_flags(%rbx) + jc failsafe_callback RESTORE_ALL testw $TRAP_syscall,4(%rsp) jz 1f @@ -50,9 +50,8 @@ FIX1: popq -15*8-8(%rsp) # error_code/entry_vector DBLFLT1:GET_CURRENT(%rbx) jmp test_all_events DBLFIX1:GET_CURRENT(%rbx) - testb $TF_failsafe_return,EDOMAIN_thread_flags(%rbx) - jnz domain_crash # cannot reenter failsafe code - orb $TF_failsafe_return,EDOMAIN_thread_flags(%rbx) + bts $_TF_failsafe_return,EDOMAIN_thread_flags(%rbx) + jc domain_crash # cannot reenter failsafe code jmp test_all_events # will return via failsafe code .previous .section __pre_ex_table,"a" @@ -65,7 +64,6 @@ DBLFIX1:GET_CURRENT(%rbx) /* No special register assumptions */ failsafe_callback: GET_CURRENT(%rbx) - andb $~TF_failsafe_return,EDOMAIN_thread_flags(%rbx) leaq EDOMAIN_trap_bounce(%rbx),%rdx movq EDOMAIN_failsafe_addr(%rbx),%rax movq %rax,TRAPBOUNCE_eip(%rdx) @@ -97,8 +95,7 @@ restore_all_xen: * NB. We must move %r10 to %rcx for C function-calling ABI. */ ALIGN -ENTRY(hypercall) - sti +ENTRY(syscall_enter) movl $__GUEST_SS,8(%rsp) pushq %r11 pushq $__GUEST_CS @@ -106,13 +103,20 @@ ENTRY(hypercall) pushq $0 movl $TRAP_syscall,4(%rsp) SAVE_ALL - movq %r10,%rcx - andq $(NR_hypercalls-1),%rax - leaq SYMBOL_NAME(hypercall_table)(%rip),%rbx - callq *(%rbx,%rax,8) GET_CURRENT(%rbx) + bts $_TF_kernel_mode,EDOMAIN_thread_flags(%rbx) + jc hypercall + swapgs + movq %rbx,%rdi + call SYMBOL_NAME(write_ptbase) + jmp restore_all_guest -ret_from_hypercall: +hypercall: + sti + movq %r10,%rcx + andq $(NR_hypercalls-1),%rax + leaq SYMBOL_NAME(hypercall_table)(%rip),%r10 + callq *(%r10,%rax,8) movq %rax,XREGS_rax(%rsp) # save the return value test_all_events: @@ -154,7 +158,7 @@ create_bounce_frame: movq XREGS_rsp+8(%rsp),%rsi testb $TF_kernel_mode,EDOMAIN_thread_flags(%rbx) jnz 1f - /* Push new frame at registered guest-OS stack top. */ + /* Push new frame at registered guest-OS stack base. */ movq EDOMAIN_kernel_sp(%rbx),%rsi 1: movq $HYPERVISOR_VIRT_START,%rax cmpq %rax,%rsi @@ -203,11 +207,11 @@ FLT15: movq %rax,(%rsi) # RCX /* Rewrite our stack frame and return to guest-OS mode. */ /* IA32 Ref. Vol. 3: TF, VM, RF and NT flags are cleared on trap. */ movb $0,TRAPBOUNCE_flags(%rdx) - testb $TF_kernel_mode,EDOMAIN_thread_flags(%rbx) - jnz 1f - orb $TF_kernel_mode,EDOMAIN_thread_flags(%rbx) + bts $_TF_kernel_mode,EDOMAIN_thread_flags(%rbx) + jc 1f swapgs - /* XXX switch page tables XXX */ + movq %rbx,%rdi + call SYMBOL_NAME(write_ptbase) 1: movl $TRAP_syscall,XREGS_entry_vector+8(%rsp) andl $0xfffcbeff,XREGS_eflags+8(%rsp) movl $__GUEST_SS,XREGS_ss+8(%rsp) @@ -425,7 +429,7 @@ ENTRY(hypercall_table) .quad SYMBOL_NAME(do_set_debugreg) .quad SYMBOL_NAME(do_get_debugreg) .quad SYMBOL_NAME(do_update_descriptor) /* 10 */ - .quad SYMBOL_NAME(do_ni_hypercall) # do_set_fast_trap + .quad SYMBOL_NAME(do_ni_hypercall) .quad SYMBOL_NAME(do_dom_mem_op) .quad SYMBOL_NAME(do_multicall) .quad SYMBOL_NAME(do_update_va_mapping) @@ -437,8 +441,9 @@ ENTRY(hypercall_table) .quad SYMBOL_NAME(do_grant_table_op) /* 20 */ .quad SYMBOL_NAME(do_vm_assist) .quad SYMBOL_NAME(do_update_va_mapping_otherdomain) - .quad SYMBOL_NAME(do_ni_hypercall) # do_switch_vm86 + .quad SYMBOL_NAME(do_switch_to_user) .quad SYMBOL_NAME(do_boot_vcpu) + .quad SYMBOL_NAME(do_set_segment_base) /* 25 */ .rept NR_hypercalls-((.-hypercall_table)/4) .quad SYMBOL_NAME(do_ni_hypercall) .endr diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c index 52230df5bf..f69d06a1cf 100644 --- a/xen/arch/x86/x86_64/mm.c +++ b/xen/arch/x86/x86_64/mm.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include void *safe_page_alloc(void) { @@ -238,6 +238,34 @@ long do_stack_switch(unsigned long ss, unsigned long esp) return 0; } +long do_set_segment_base(unsigned int which, unsigned long base) +{ + struct exec_domain *ed = current; + + switch ( which ) + { + case SEGBASE_FS: + ed->arch.user_ctxt.fs_base = base; + wrmsr(MSR_FS_BASE, base, base>>32); + break; + + case SEGBASE_GS_USER: + ed->arch.user_ctxt.gs_base_user = base; + wrmsr(MSR_SHADOW_GS_BASE, base, base>>32); + break; + + case SEGBASE_GS_KERNEL: + ed->arch.user_ctxt.gs_base_kernel = base; + wrmsr(MSR_GS_BASE, base, base>>32); + break; + + default: + return -EINVAL; + } + + return 0; +} + /* Returns TRUE if given descriptor is valid for GDT or LDT. */ int check_descriptor(struct desc_struct *d) diff --git a/xen/arch/x86/x86_64/traps.c b/xen/arch/x86/x86_64/traps.c index 96ae0424b4..3c7c8ea7ec 100644 --- a/xen/arch/x86/x86_64/traps.c +++ b/xen/arch/x86/x86_64/traps.c @@ -153,12 +153,14 @@ asmlinkage void do_double_fault(struct xen_regs *regs) __asm__ __volatile__ ( "hlt" ); } -asmlinkage void hypercall(void); +asmlinkage void syscall_enter(void); void __init percpu_traps_init(void) { - char *stack_top = (char *)get_stack_top(); - char *stack = (char *)((unsigned long)stack_top & ~(STACK_SIZE - 1)); - int cpu = smp_processor_id(); + char *stack_bottom, *stack; + int cpu = smp_processor_id(); + + stack_bottom = (char *)get_stack_bottom(); + stack = (char *)((unsigned long)stack_bottom & ~(STACK_SIZE - 1)); /* Double-fault handler has its own per-CPU 1kB stack. */ init_tss[cpu].ist[0] = (unsigned long)&stack[1024]; @@ -181,17 +183,17 @@ void __init percpu_traps_init(void) stack[0] = 0x48; stack[1] = 0x89; stack[2] = 0x25; - *(u32 *)&stack[3] = (stack_top - &stack[7]) - 16; + *(u32 *)&stack[3] = (stack_bottom - &stack[7]) - 16; /* leaq saversp(%rip), %rsp */ stack[7] = 0x48; stack[8] = 0x8d; stack[9] = 0x25; - *(u32 *)&stack[10] = (stack_top - &stack[14]) - 16; + *(u32 *)&stack[10] = (stack_bottom - &stack[14]) - 16; - /* jmp hypercall */ + /* jmp syscall_enter */ stack[14] = 0xe9; - *(u32 *)&stack[15] = (char *)hypercall - &stack[19]; + *(u32 *)&stack[15] = (char *)syscall_enter - &stack[19]; /* * Trampoline for SYSCALL entry from compatibility mode. @@ -205,17 +207,17 @@ void __init percpu_traps_init(void) stack[0] = 0x48; stack[1] = 0x89; stack[2] = 0x25; - *(u32 *)&stack[3] = (stack_top - &stack[7]) - 16; + *(u32 *)&stack[3] = (stack_bottom - &stack[7]) - 16; /* leaq saversp(%rip), %rsp */ stack[7] = 0x48; stack[8] = 0x8d; stack[9] = 0x25; - *(u32 *)&stack[10] = (stack_top - &stack[14]) - 16; + *(u32 *)&stack[10] = (stack_bottom - &stack[14]) - 16; - /* jmp hypercall */ + /* jmp syscall_enter */ stack[14] = 0xe9; - *(u32 *)&stack[15] = (char *)hypercall - &stack[19]; + *(u32 *)&stack[15] = (char *)syscall_enter - &stack[19]; /* * Common SYSCALL parameters. diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h index a6e5a9e5b0..d8821f50e3 100644 --- a/xen/include/asm-x86/domain.h +++ b/xen/include/asm-x86/domain.h @@ -98,6 +98,7 @@ struct arch_exec_domain */ l1_pgentry_t *perdomain_ptes; pagetable_t pagetable; + pagetable_t pagetable_user; /* x86/64: user-space pagetable. */ pagetable_t monitor_table; pagetable_t phys_table; /* 1:1 pagetable */ diff --git a/xen/include/asm-x86/msr.h b/xen/include/asm-x86/msr.h index ea17a45aef..a7178f2e3b 100644 --- a/xen/include/asm-x86/msr.h +++ b/xen/include/asm-x86/msr.h @@ -63,7 +63,7 @@ #define MSR_SYSCALL_MASK 0xc0000084 /* EFLAGS mask for syscall */ #define MSR_FS_BASE 0xc0000100 /* 64bit GS base */ #define MSR_GS_BASE 0xc0000101 /* 64bit FS base */ -#define MSR_KERNEL_GS_BASE 0xc0000102 /* SwapGS GS shadow (or USER_GS from kernel) */ +#define MSR_SHADOW_GS_BASE 0xc0000102 /* SwapGS GS shadow */ /* EFER bits: */ #define _EFER_SCE 0 /* SYSCALL/SYSRET */ #define _EFER_LME 8 /* Long mode enable */ diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h index 30c6079e85..4e4d648480 100644 --- a/xen/include/asm-x86/processor.h +++ b/xen/include/asm-x86/processor.h @@ -130,8 +130,10 @@ #define TBF_FAILSAFE 16 /* arch_exec_domain' flags values */ -#define TF_failsafe_return 1 -#define TF_kernel_mode 2 +#define _TF_failsafe_return 0 +#define _TF_kernel_mode 1 +#define TF_failsafe_return (1<<_TF_failsafe_return) +#define TF_kernel_mode (1<<_TF_kernel_mode) #ifndef __ASSEMBLY__ diff --git a/xen/include/asm-x86/x86_32/current.h b/xen/include/asm-x86/x86_32/current.h index 3c254191ba..38a3adff61 100644 --- a/xen/include/asm-x86/x86_32/current.h +++ b/xen/include/asm-x86/x86_32/current.h @@ -34,11 +34,11 @@ static inline execution_context_t *get_execution_context(void) } /* - * Get the top-of-stack, as stored in the per-CPU TSS. This is actually - * 20 bytes below the real top of the stack to allow space for: + * Get the bottom-of-stack, as stored in the per-CPU TSS. This is actually + * 20 bytes before the real bottom of the stack to allow space for: * domain pointer, DS, ES, FS, GS. */ -static inline unsigned long get_stack_top(void) +static inline unsigned long get_stack_bottom(void) { unsigned long p; __asm__ ( "andl %%esp,%0; addl %2,%0" diff --git a/xen/include/asm-x86/x86_64/current.h b/xen/include/asm-x86/x86_64/current.h index efa170f775..fb5a7abc02 100644 --- a/xen/include/asm-x86/x86_64/current.h +++ b/xen/include/asm-x86/x86_64/current.h @@ -34,11 +34,11 @@ static inline execution_context_t *get_execution_context(void) } /* - * Get the top-of-stack, as stored in the per-CPU TSS. This is actually - * 64 bytes below the real top of the stack to allow space for: + * Get the bottom-of-stack, as stored in the per-CPU TSS. This is actually + * 64 bytes before the real bottom of the stack to allow space for: * domain pointer, DS, ES, FS, GS, FS_BASE, GS_BASE_OS, GS_BASE_APP */ -static inline unsigned long get_stack_top(void) +static inline unsigned long get_stack_bottom(void) { unsigned long p; __asm__ ( "orq %%rsp,%0; andq $~7,%0" diff --git a/xen/include/public/arch-x86_64.h b/xen/include/public/arch-x86_64.h index 6ebf988c3c..3f14c3a809 100644 --- a/xen/include/public/arch-x86_64.h +++ b/xen/include/public/arch-x86_64.h @@ -77,12 +77,38 @@ #define HYPERVISOR_VIRT_END (0xFFFF880000000000UL) #endif +#ifndef __ASSEMBLY__ + /* The machine->physical mapping table starts at this address, read-only. */ #ifndef machine_to_phys_mapping #define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START) #endif -#ifndef __ASSEMBLY__ +/* + * int HYPERVISOR_set_segment_base(unsigned int which, unsigned long base) + * @which == SEGBASE_* ; @base == 64-bit base address + * Returns 0 on success. + */ +#define SEGBASE_FS 0 +#define SEGBASE_GS_USER 1 +#define SEGBASE_GS_KERNEL 2 + +/* + * int HYPERVISOR_switch_to_user(void) + * All arguments are on the kernel stack, in the following format. + * Never returns if successful. Current kernel context is lost. + * If flags contains ECF_IN_SYSCALL: + * Restore RIP, RFLAGS, RSP. + * Discard R11, RCX, CS, SS. + * Otherwise: + * Restore R11, RCX, CS:RIP, RFLAGS, SS:RSP. + * All other registers are saved on hypercall entry and restored to user. + */ +struct switch_to_user { + /* Top of stack (%rsp at point of hypercall). */ + u64 r11, rcx, flags, rip, cs, rflags, rsp, ss; + /* Bottom of switch_to_user stack frame. */ +} PACKED; /* NB. Both the following are 64 bits each. */ typedef unsigned long memory_t; /* Full-sized pointer/address/memory-size. */ @@ -136,8 +162,8 @@ typedef struct xen_regs u64 fs; /* Non-zero => takes precedence over fs_base. */ u64 gs; /* Non-zero => takes precedence over gs_base_app. */ u64 fs_base; - u64 gs_base_os; - u64 gs_base_app; + u64 gs_base_kernel; + u64 gs_base_user; } PACKED execution_context_t; typedef u64 tsc_timestamp_t; /* RDTSC timestamp */ diff --git a/xen/include/public/xen.h b/xen/include/public/xen.h index c5543a2293..a9222da7b6 100644 --- a/xen/include/public/xen.h +++ b/xen/include/public/xen.h @@ -42,7 +42,7 @@ #define __HYPERVISOR_set_debugreg 8 #define __HYPERVISOR_get_debugreg 9 #define __HYPERVISOR_update_descriptor 10 -#define __HYPERVISOR_set_fast_trap 11 +#define __HYPERVISOR_set_fast_trap 11 /* x86/32 only */ #define __HYPERVISOR_dom_mem_op 12 #define __HYPERVISOR_multicall 13 #define __HYPERVISOR_update_va_mapping 14 @@ -54,8 +54,10 @@ #define __HYPERVISOR_grant_table_op 20 #define __HYPERVISOR_vm_assist 21 #define __HYPERVISOR_update_va_mapping_otherdomain 22 -#define __HYPERVISOR_switch_vm86 23 +#define __HYPERVISOR_switch_vm86 23 /* x86/32 only */ +#define __HYPERVISOR_switch_to_user 23 /* x86/64 only */ #define __HYPERVISOR_boot_vcpu 24 +#define __HYPERVISOR_set_segment_base 25 /* x86/64 only */ /* * MULTICALLS @@ -118,6 +120,10 @@ * val[7:0] == MMUEXT_NEW_BASEPTR: * ptr[:2] -- Machine address of new page-table base to install in MMU. * + * val[7:0] == MMUEXT_NEW_USER_BASEPTR: [x86/64 only] + * ptr[:2] -- Machine address of new page-table base to install in MMU + * when in user space. + * * val[7:0] == MMUEXT_TLB_FLUSH: * No additional arguments. * @@ -166,6 +172,7 @@ #define MMUEXT_CLEAR_FOREIGNDOM 11 #define MMUEXT_TRANSFER_PAGE 12 /* ptr = MA of frame; val[31:16] = dom */ #define MMUEXT_REASSIGN_PAGE 13 +#define MMUEXT_NEW_USER_BASEPTR 14 #define MMUEXT_CMD_MASK 255 #define MMUEXT_CMD_SHIFT 8 -- 2.30.2